suppressPackageStartupMessages({
# library(pbapply)
library(pbmcapply)
library(parallel)
library(microbenchmark)
library(dplyr)
library(plotly)
})
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'tibble'
## Warning: replacing previous import 'ellipsis::check_dots_unnamed' by
## 'rlang::check_dots_unnamed' when loading 'tibble'
## Warning: replacing previous import 'ellipsis::check_dots_used' by
## 'rlang::check_dots_used' when loading 'tibble'
## Warning: replacing previous import 'ellipsis::check_dots_empty' by
## 'rlang::check_dots_empty' when loading 'tibble'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'
## Warning: replacing previous import 'ellipsis::check_dots_unnamed' by
## 'rlang::check_dots_unnamed' when loading 'pillar'
## Warning: replacing previous import 'ellipsis::check_dots_used' by
## 'rlang::check_dots_used' when loading 'pillar'
## Warning: replacing previous import 'ellipsis::check_dots_empty' by
## 'rlang::check_dots_empty' when loading 'pillar'
# load engines' functions
sapply(FUN = source,
X = list.files("../src",
pattern = ".R$",
full.names = T))
# R options
options(max.print = 20)
We want to check the time taken by the crossing simulation according to genetic data size and number of crosses.
The variable that can influence the most the simulation time are:
calc_progenyBlupEstimation)phasedGenoFile <- '../data/geno/breedGame_phasedGeno.vcf.gz'
SNPcoordFile <- '../data/SNPcoordinates/breedingGame_SNPcoord.csv'
markerEffectsFile <- '../data/markerEffects/breedGame_markerEffects.json'
g <- readPhasedGeno(phasedGenoFile)
SNPcoord <- readSNPcoord(SNPcoordFile)
markerEffects <- readMarkerEffects(markerEffectsFile)
## 2023-02-28 16:37:56 - r-readPhasedGeno(): Check file extention ...
## 2023-02-28 16:37:56 - r-readPhasedGeno(): Read geno file ...
## 2023-02-28 16:37:59 - r-readPhasedGeno(): Read phased geno file DONE
## 2023-02-28 16:37:59 - r-readPhasedGeno(): Extract SNP information...
## 2023-02-28 16:37:59 - r-readPhasedGeno(): Extract SNP information DONE
## 2023-02-28 16:37:59 - r-readPhasedGeno(): Check pahsing ...
## 2023-02-28 16:37:59 - r-readPhasedGeno(): Check pahsing DONE
## 2023-02-28 16:37:59 - r-readPhasedGeno(): Extract haplotypes...
## 2023-02-28 16:38:03 - r-readPhasedGeno(): Extract haplotypes DONE
## 2023-02-28 16:38:03 - r-readPhasedGeno(): DONE, return output.
## 2023-02-28 16:38:03 - r-readSNPcoord(): Read snps coordinates file ...
## 2023-02-28 16:38:03 - r-readSNPcoord(): Read snps coordinates file DONE
## 2023-02-28 16:38:03 - r-readSNPcoord(): Check snps coordinates file ...
## 2023-02-28 16:38:03 - r-readSNPcoord(): Check snps coordinates file DONE
## 2023-02-28 16:38:03 - r-readMarkerEffects(): Marker effects file ...
## 2023-02-28 16:38:03 - r-readMarkerEffects_json(): Read marker effects file ...
## 2023-02-28 16:38:03 - r-readMarkerEffects_json(): Read marker effects file DONE
## 2023-02-28 16:38:03 - r-readMarkerEffects_json(): Check marker effects file ...
## 2023-02-28 16:38:03 - r-readMarkerEffects_json(): Check marker effects file DONE
We have at our disposal:
nCores <- 10 # number of cores on which the simulations are run in parallel
nRepetition <- 4 # number of repetition for each scenario
nValues <- 4 # number of different values taken by each parameters
genoSize_max <- nrow(g$haplotypes)
genoSize_min <- round(genoSize_max / nValues)
genoSize <- round(seq(genoSize_min, genoSize_max, length.out = nValues))
nCouples_max <- 100 # choose(n = ncol(g$haplotypes)/2, 2)
nCouples_min <- 10 # nCouples_max / nParams_allCombinations
nCouples <- round(seq(nCouples_min, nCouples_max, length.out = nValues))
nProgeny_max <- 20
nProgeny_min <- 10 # nProgeny_max / nParams_allCombinations
nProgeny <- round(seq(nProgeny_min, nProgeny_max, length.out = nValues))
It is hard to use directly the simulation function form the engine because I would need to create input file for each scenario. It will instead use a function similar to the one used in the engine, which will be easier to manage for this profiling. It will return information about the simulation time.
NOTE: The loading time of phased genotype will not be taken in account in the profiling !
profileTime_simul <- function(genoSize, nCouples, nProgeny, haplo, SNPcoord){
# setup simulation data
## Sample SNP's genotype
selectedSNP <- sample(SNPcoord$SNPid, genoSize)
haplo <- haplo[selectedSNP, ]
SNPcoord <- SNPcoord[SNPcoord$SNPid %in% selectedSNP, ]
## create crossing table
allInds <- gsub('_[12]$', '', colnames(haplo)[1:(ncol(haplo)/2)])
crossTable <- expand.grid(allInds, allInds)
crossTable <- crossTable[1:(nrow(crossTable)/2),]
crossTable <- crossTable[sample(nrow(crossTable), nCouples),]
colnames(crossTable) <- c('ind1', 'ind2')
crossTable$n <- nProgeny
crossTable$names <- paste(crossTable$ind1, '_x_', crossTable$ind2)
# simulation
t <- Sys.time()
parentPopulation <- initializeSimulation(haplotypes = haplo,
SNPcoord = SNPcoord)
simulatedIndividuals <- breedSimulatR::makeCrosses(crosses = crossTable,
pop = parentPopulation)
simulatedPop <- breedSimulatR::population$new(inds = simulatedIndividuals,
verbose = FALSE)
simulatedPop$writeVcf(tempfile(fileext = '.vcf.gz'))
simTime <- difftime(Sys.time(), t, units = 's')
return(as.numeric(simTime))
}
profileTime_theory <- function(genoSize, nCouples, haplo, SNPcoord, markerEffects){
# setup simulation data
## Sample SNP's genotype
selectedSNP <- sample(SNPcoord$SNPid, genoSize)
haplo <- haplo[selectedSNP, ]
SNPcoord <- SNPcoord[SNPcoord$SNPid %in% selectedSNP, ]
markerEffects$SNPeffects <- markerEffects$SNPeffects[SNPcoord$SNPid %in% selectedSNP,, drop = FALSE]
## create crossing table
allInds <- gsub('_[12]$', '', colnames(haplo)[1:(ncol(haplo)/2)])
crossTable <- expand.grid(allInds, allInds)
crossTable <- crossTable[1:(nrow(crossTable)/2),]
crossTable <- crossTable[sample(nrow(crossTable), nCouples),]
colnames(crossTable) <- c('ind1', 'ind2')
# calculation:
t <- Sys.time()
# calculate recombination rate
r <- calcRecombRate(SNPcoord)
# initialize results data.frame
blupVarExp <- crossTable[, c('ind1', 'ind2')]
blupVarExp$blup_var <- NA
blupVarExp$blup_exp <- NA
# calculation for each couple
nCrosses <- nrow(crossTable)
for (i in seq(nCrosses)) {
p1.id <- which(grepl(crossTable$ind1[i], colnames(haplo)))
p2.id <- which(grepl(crossTable$ind2[i], colnames(haplo)))
geneticCovar <- calcProgenyGenetCovar(SNPcoord, r, haplo, p1.id, p2.id)
blupVar <- calcProgenyBlupVariance(SNPcoord, markerEffects, geneticCovar)
blupExp <- calcProgenyBlupExpected(SNPcoord, haplo,
p1.id, p2.id, markerEffects)
blupVarExp$blup_var[i] <- blupVar
blupVarExp$blup_exp[i] <- blupExp
}
simTime <- difftime(Sys.time(), t, units = 's')
return(as.numeric(simTime))
}
set.seed(2022)
# prepare simulation parameters for mapply
simulParams_simul <- expand.grid(genoSize = genoSize,
nCouples = nCouples,
nProgeny = nProgeny)
simulParams_simul <- simulParams_simul[rep(seq_len(nrow(simulParams_simul)), each = nRepetition),]
simulParams_simul <- as.list(simulParams_simul[sample(nrow(simulParams_simul)), ])
# simTimes <- mcmapply(profileTime_simul,
simTime <- pbmcmapply(profileTime_simul,
genoSize = simulParams_simul$genoSize,
nCouples = simulParams_simul$nCouples,
nProgeny = simulParams_simul$nProgeny,
MoreArgs = list(haplo = g$haplotypes,
SNPcoord = SNPcoord),
mc.cores = nCores)
if (!is.null(ncol(simTime))) {
simTime <- t(simTime)
}
simulParams_simul <- as.data.frame(simulParams_simul)
results <- cbind(simulParams_simul, simTime)
results <- results[order(results$simTime),]
write.csv(results, 'profilingResults_crossSim.csv', quote = F, row.names = F)
set.seed(2022)
# prepare simulation parameters for mapply
# prepare simulation parameters for mapply
simulParams_theory <- expand.grid(genoSize = genoSize,
nCouples = nCouples)
simulParams_theory <- simulParams_theory[rep(seq_len(nrow(simulParams_theory)), each = nRepetition),]
simulParams_theory <- as.list(simulParams_theory[sample(nrow(simulParams_theory)), ])
profileTime_theory(genoSize = simulParams_theory$genoSize[1],
nCouples = simulParams_theory$nCouples[1],
haplo = g$haplotypes,
SNPcoord = SNPcoord,
markerEffects = markerEffects)
# simTimes <- mcmapply(profileTime_theory,
simTime <- pbmcmapply(profileTime_theory,
genoSize = simulParams_theory$genoSize,
nCouples = simulParams_theory$nCouples,
MoreArgs = list(haplo = g$haplotypes,
SNPcoord = SNPcoord,
markerEffects = markerEffects),
mc.cores = nCores)
if (!is.null(ncol(simTime))) {
simTime <- t(simTime)
}
simulParams_theory <- as.data.frame(simulParams_theory)
results <- cbind(simulParams_theory, simTime)
results <- results[order(results$simTime),]
write.csv(results, 'profilingResults_theory.csv', quote = F, row.names = F)
## [1] 2.591728
dta <- read.csv('profilingResults_crossSim.csv')
simTimeSummary <- dta %>% group_by(genoSize, nCouples, nProgeny) %>% summarise(
meanTime = mean(simTime),
nObs = n()
)
## `summarise()` has grouped output by 'genoSize', 'nCouples'. You can override
## using the `.groups` argument.
simTimeSummary[order(simTimeSummary$meanTime),]
p <- plot_ly(type = "scatter",
mode = "markers",
data = dta,
x = ~genoSize,
y = ~simTime,
color = ~as.factor(nCouples),
symbol = ~nProgeny,
hoverinfo = 'text',
text = apply(dta, 1, function(l) {
paste(names(l), ":", l, collapse = "\n")
})
)
p %>% layout(xaxis = list(type = "log"),
yaxis = list(type = "log"))
p <- plot_ly(mode = "markers",
data = dta,
x = ~genoSize,
y = ~nProgeny,
z = ~simTime,
color = ~as.factor(nCouples))
p
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
dta <- read.csv('profilingResults_theory.csv')
simTimeSummary <- dta %>% group_by(genoSize, nCouples) %>% summarise(
meanTime = mean(simTime),
nObs = n()
)
## `summarise()` has grouped output by 'genoSize'. You can override using the
## `.groups` argument.
simTimeSummary[order(simTimeSummary$meanTime),]
p <- plot_ly(type = "scatter",
mode = "markers",
data = dta,
x = ~genoSize,
y = ~simTime,
color = ~as.factor(nCouples),
hoverinfo = 'text',
text = apply(dta, 1, function(l) {
paste(names(l), ":", l, collapse = "\n")
})
)
p
p %>% layout(#xaxis = list(type = "log"))
yaxis = list(type = "log"))
p <- plot_ly(mode = "markers",
data = dta,
x = ~genoSize,
y = ~nCouples,
z = ~simTime)
p
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## Document generated in:
## Time difference of 9.717672 mins
##
## CPU: AMD Ryzen 5 3600X 6-Core Processor
## Memory total size: 32.79236 GB
##
##
## Session information:
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Pop!_OS 22.04 LTS
##
## Matrix products: default
## BLAS: /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] plotly_4.10.1 ggplot2_3.3.3 dplyr_1.0.5
## [4] microbenchmark_1.4.9 pbmcapply_1.5.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.26 memuse_4.2-1 purrr_0.3.4
## [5] splines_4.1.1 lattice_0.20-41 colorspace_2.0-0 vctrs_0.3.6
## [9] generics_0.1.0 htmltools_0.5.1.1 viridisLite_0.3.0 vcfR_1.12.0
## [13] yaml_2.2.1 mgcv_1.8-33 utf8_1.2.1 rlang_1.0.6
## [17] jquerylib_0.1.4 pillar_1.5.1 glue_1.4.2 withr_2.5.0
## [21] RColorBrewer_1.1-2 lifecycle_1.0.0 stringr_1.4.0 munsell_0.5.0
## [25] gtable_0.3.0 htmlwidgets_1.5.3 evaluate_0.14 knitr_1.31
## [29] permute_0.9-7 crosstalk_1.1.1 fansi_0.4.2 Rcpp_1.0.7
## [33] pinfsc50_1.2.0 scales_1.1.1 vegan_2.6-2 jsonlite_1.7.2
## [37] farver_2.1.0 digest_0.6.27 stringi_1.7.12 grid_4.1.1
## [41] cli_3.6.0 tools_4.1.1 magrittr_2.0.1 lazyeval_0.2.2
## [45] tibble_3.1.0 cluster_2.1.2 crayon_1.4.1 ape_5.6-2
## [49] tidyr_1.1.3 pkgconfig_2.0.3 Matrix_1.3-2 ellipsis_0.3.1
## [53] MASS_7.3-53.1 data.table_1.14.0 rmarkdown_2.11 httr_1.4.2
## [57] rstudioapi_0.13 R6_2.5.0 nlme_3.1-152 compiler_4.1.1